library(macpie)
library(tibble)
library(stringr)
library(pheatmap)
library(ggiraph)
library(tidyseurat)
## Loading required package: ttservice
## 
## Attaching package: 'ttservice'
## The following objects are masked from 'package:macpie':
## 
##     bind_cols, bind_rows, plot_ly
## Loading required package: SeuratObject
## Loading required package: sp
## 
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
## 
##     intersect, t
## ========================================
## tidyseurat version 0.8.0
## If you use TIDYSEURAT in published research, please cite:
## 
## Mangiola et al. Interfacing Seurat with the R tidy universe. Bioinformatics 2021.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(tidyseurat))
##   
## To restore the Seurat default display use options("restore_Seurat_show" = TRUE) 
## ========================================
## 
## Attaching package: 'tidyseurat'
## The following object is masked from 'package:ttservice':
## 
##     plot_ly
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:testthat':
## 
##     is_null
library(DT)
## 
## Attaching package: 'DT'
## The following object is masked from 'package:SeuratObject':
## 
##     JS
## The following object is masked from 'package:macpie':
## 
##     JS
library(ggrepel)
library(ggalluvial) 
library(ggplot2)
library(ggvenn)
## Loading required package: grid
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:macpie':
## 
##     plotMA

DRUGseq results

# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
# load(paste0(dir,"DRUGseqData/de.RData")) 
# batch24_de <- de_res%>%filter(batch_id=="24")
# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds"))
batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds"))
head(batch24_de)
##   investigation_id analysis_id pipeline_run_key pipeline_group_run_key method
## 1             2384          24               27                     27  limma
## 2             2384          24               27                     27  limma
## 3             2384          24               27                     27  limma
## 4             2384          24               27                     27  limma
## 5             2384          24               27                     27  limma
## 6             2384          24               27                     27  limma
##                 normalization
## 1 quantile=0.75;TMM;log2(CPM)
## 2 quantile=0.75;TMM;log2(CPM)
## 3 quantile=0.75;TMM;log2(CPM)
## 4 quantile=0.75;TMM;log2(CPM)
## 5 quantile=0.75;TMM;log2(CPM)
## 6 quantile=0.75;TMM;log2(CPM)
##                                                  comparison_name_public
## 1 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 2 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 3 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 4 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 5 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
## 6 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
##   cmpd_sample_id concentration unit hours_post_treatment batch_id
## 1     UD-20-ER54            10   uM                   24       24
## 2     UD-20-ER54            10   uM                   24       24
## 3     UD-20-ER54            10   uM                   24       24
## 4     UD-20-ER54            10   uM                   24       24
## 5     UD-20-ER54            10   uM                   24       24
## 6     UD-20-ER54            10   uM                   24       24
##            gene.ID     logFC    adj.P.Val      P.Value   AveExpr         t
## 1   MYL6,grch38_12  1.726500 6.467501e-43 4.413170e-47 11.768734 15.518950
## 2   APRT,grch38_16  2.334604 2.164408e-24 2.953815e-28  8.730357 11.508262
## 3  KRT18,grch38_12  1.157740 1.327225e-15 2.716939e-19  9.335040  9.239790
## 4    SCD,grch38_10  1.199936 1.352286e-13 3.690990e-17  7.732632  8.636743
## 5  TUBB6,grch38_18 -1.005931 3.723618e-11 1.270426e-14  8.083550 -7.873065
## 6 TIMM8B,grch38_11  1.109810 3.654986e-10 1.496412e-13  7.957763  7.532087
FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56")
ff86_res_toptable <- FF86_res[,13:18]
ff86_res_toptable <- ff86_res_toptable %>% 
  separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>%
  select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>%
  rename(log2FC=logFC, p_value_adj = adj.P.Val)
head(ff86_res_toptable)
##         gene    log2FC   p_value_adj       P.Value     AveExpr         t
## 1 AC122713.2  4.862957 1.397952e-191 9.539077e-196  0.04920401  41.95785
## 2      TRIP6 -3.328079 5.156775e-166 7.037564e-170  9.40624753 -37.18794
## 3   MIR7-3HG  8.513923 2.799813e-151 5.731449e-155  0.20792310  34.52594
## 4       TUBB -2.898698 1.317157e-145 3.595107e-149 11.65399547 -33.50503
## 5     CYP4F2  6.454706 7.730175e-142 2.637385e-145  0.10131583  32.82814
## 6       HDGF -3.828482 2.967781e-140 1.215059e-143  9.05238910 -32.53773
##     combined_id
## 1 FF_86_NH56_10
## 2 FF_86_NH56_10
## 3 FF_86_NH56_10
## 4 FF_86_NH56_10
## 5 FF_86_NH56_10
## 6 FF_86_NH56_10
plot_volcano(ff86_res_toptable, max.overlaps =5)
## Warning: ggrepel: 4342 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow()
## [1] 1423
drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull()
#show histogram of p values
ggplot(ff86_res_toptable, aes(x = P.Value)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") +
  theme_minimal()

macpie results

All DMSO wells

Load filered data

# mac_filtered <- filter_genes_by_expression(as_mac,
#                 group_by = "combined_id", min_counts = 10,
#                 min_samples = min_sample_num)

# saveRDS(mac_filtered,
# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))


mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")

Correction of batch effect

According to DRUGseq metadata:

  • Wells with water are labelled as EC-27-RY89

  • Wells with DMSO are labelled as CB-43-EP73

# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "raw")+ scale_x_discrete(drop = FALSE) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "limma_voom")+ scale_x_discrete(drop = FALSE) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "DESeq2")+ scale_x_discrete(drop = FALSE) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## converting counts to integer mode
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "edgeR")+ scale_x_discrete(drop = FALSE) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

Note: instead of discussing which correction methods we should use for this data, we only show the ways we detected and corrected batch effect here. As batch effect adjustment for sequencing data has been implemented in different methods, such as DESeq2, RUVSeq, edgeR. We highly recommend users thoroughly checking any batch effects and exploring different methods.

In the next part of the vignette, we demonstrate a batch parameter has implemented in the differential expression test for batch correction.

##Limma trend 

# data <- mac_badDSMOremoved
# treatment_samples <- "FF_86_NH56_10"
# control_samples <- "CB_43_EP73_0"

 limma.trend<- function(data = data, treatment_samples, control_samples){
   data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)]
   batch <- data$orig.ident
if (length(unique(data$combined_id)) < 2) {
      stop("Insufficient factors for differential expression analysis.")
    }
pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
combined_id <- data$combined_id
model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
      model.matrix(~0+combined_id + batch)

    dge <- DGEList(counts = data@assays$RNA$counts,
                   samples = pheno_data$condition,
                   group = pheno_data$condition)
    dge <- calcNormFactors(dge, method="TMMwsp")
    # keep <- edgeR::filterByExpr(dge, design = model_matrix)
    # dge  <- dge[keep, , keep.lib.sizes = FALSE]
    clean.TMM<-log2(edgeR::cpm(dge,  normalized.lib.sizes=T,log=F)+1) 
    myargs <- list(paste0("combined_id",
                          treatment_samples, "-",
                          paste0("combined_id", control_samples)),
                   levels = model_matrix)
    contrasts <- do.call(makeContrasts, myargs)
    
    lmfit <- lmFit(clean.TMM, model_matrix)
    lmfit <- contrasts.fit(lmfit, contrasts)
    lmfit <- eBayes(lmfit, trend = TRUE)
    #Warning: Zero sample variances detected, have been offset away from zero
    top_table<- topTable(lmfit, number = nrow(data)) %>%
      as.data.frame() %>%
      select("logFC", "t", "P.Value", "adj.P.Val") %>%
      rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>%
      rownames_to_column("gene")

    return(top_table)}

    # top_table%>%filter(p_value_adj < 0.05 & log2FC>0)
# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq
plot_alluvial <- function(macpie_tbl, drugseq_tbl){
  df <- macpie_tbl %>%
  transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>%
  inner_join(
    drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC),
    by = "gene"
  )

#  Rank within each method by (padj, |log2FC|), tie-safe
rank_by_two <- function(padj, lfc) {
  ord <- order(padj, -abs(lfc), na.last = TRUE)
  r   <- integer(length(padj))
  r[ord] <- seq_along(ord)
  r
}
df <- df %>%
  mutate(
    rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac),
    rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg)
  )

# Define rank bins
cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf)
labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500")

df <- df %>%
  mutate(
    mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE),
    drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE)
  )

# Ranking movement category (color)
bin_index <- function(x) match(x, labs)   # lower index = "more top"
df <- df %>%
  mutate(
    mac_idx = bin_index(mac_bin),
    drg_idx = bin_index(drg_bin),
    movement = case_when(
      is.na(mac_idx) | is.na(drg_idx) ~ NA_character_,
      mac_idx < drg_idx ~ "Higher rank in macpie",   # moved up into a more-top bin
      mac_idx > drg_idx ~ "Lower rank in macpie",    # moved down
      TRUE               ~ "Same rank"
    ),
    movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie"))
  ) %>%
  tidyr::drop_na(mac_bin, drg_bin, movement)

#  Aggregate flows for alluvial
flows <- df %>%
  count(mac_bin, drg_bin, movement, name = "n") %>%
  mutate(
    mac_bin = factor(mac_bin, levels = labs),
    drg_bin = factor(drg_bin, levels = labs)
  )

ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) +
  geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) +
  geom_stratum(width = 0.14, color = "grey30") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) +
  scale_fill_manual(values = c("Higher rank in macpie" = "orange",
                               "Same rank"        = "#7570b3",
                               "Lower rank in macpie" = "grey60")) +
  labs(x = NULL, y = "Gene count",
       fill = "Movement vs DRUGseq",
       subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") +
  theme_classic()
  
}

Differential gene expression

In here, you can specify a single condition in the combined_id column and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the column of orig.ident as the input for batch parameter, compute_singe_de function can perform differential expression analysis using the preferred method (limma voom in this example) with batch information.

methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")

methods_res <- lapply(methods, function(m){
  
  message("\n","Processing method: ", m,"\n") 
  # m<-"limma_voom"
  treatment_samples <- "FF_86_NH56_10"
  control_samples <- "CB_43_EP73_0"
  subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples))

  batch <- subset$orig.ident
  if (m!="limma_trend"){
    subset <- filter_genes_by_expression(subset,
                           group_by = "combined_id", min_counts = 5,
                           min_samples = 1)
    top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
  } else{
    top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
  }
  
  # plot(plot_volcano(top_table, max.overlaps = 5))
  alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  length(intersect(alldmso_degs, drugseq_deg))
  
  top_table <- top_table %>%
    arrange(p_value_adj, desc(log2FC)) %>%
    mutate(gene = factor(gene, levels = unique(gene)))
  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
  top_table <- top_table %>%
    mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no"))
  
  plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
  
  
  ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"],
          top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater")
  
  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
  #   geom_point(alpha = 0.6, size = 1.2) +
  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
  #   theme_classic()
  # 
  # label a few top overlapping genes
  lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |>
    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
  
  volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
    geom_point(alpha = 0.6, size = 1.2) +
    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) +
    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
    theme_classic()
  
  ks <- c(25,50,100,200,500,1000)
  prec_at_k <- sapply(ks, function(k)
    mean(top_table$in_drugseq_deg[1:k] == "yes")
  )
  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
  #      ylab = "Precision@k (fraction in DRUGseq set)")
  
  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
    geom_point() + geom_line() +
    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
    theme_classic()
  
  # alluvial plot
  macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
  
  #return 
  result_list <- list(top_table = top_table,
                      num_degs_macpie = length(alldmso_degs),
                      n_overlap = length(intersect(alldmso_degs, drugseq_deg)),
                      ecdf_plot = plt_ecdf,
                      ks_test_results = ks_test_results,
                      volcano_plot = volcano_overlap,
                      ks_plot = ks_plot,
                      alluvial_plot = alluvial_plot)
  return(result_list)
  
})
## 
## Processing method: limma_voom
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
## 
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1186 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
## 
## Processing method: edgeR
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
## p-value will be approximate in the presence of ties
## 
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(methods_res) <- methods

Summary table

#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(methods_res, function(x) {
  data.frame(
    num_degs_macpie = x$num_degs_macpie,
    n_overlap = x$n_overlap,
    num_degs_DRUGseq = length(drugseq_deg)
  )
}, .id = paste0("macpie_methods"))

deg_summary
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1     limma_voom            4280       778             1423
## 2         DESeq2            1959       779             1423
## 3          edgeR            2321       721             1423
## 4    limma_trend            1574       400             1423

Overlapping volcano plot

methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 746 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

methods_res$DESeq2$volcano_plot
## Warning: Removed 831 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: ggrepel: 738 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

methods_res$edgeR$volcano_plot
## Warning: ggrepel: 654 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

methods_res$limma_trend$volcano_plot
## Warning: ggrepel: 368 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Alluvial plot

methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

#append top_table from each method into a single data frame
all_methods_top_table <- map_df(methods_res, "top_table", .id = "method")
head(all_methods_top_table)
##       method       gene   log2FC   metric      p_value  p_value_adj
## 1 limma_voom AL031963.2 1.483560 27.66417 4.119762e-36 8.820411e-32
## 2 limma_voom   Z97192.4 5.026037 26.15530 9.565172e-35 1.023952e-30
## 3 limma_voom    SLC45A4 1.720577 20.68042 3.573334e-29 2.550170e-25
## 4 limma_voom AC007066.1 1.844768 18.51969 1.356781e-22 7.262170e-19
## 5 limma_voom    CYCSP44 1.696008 14.95493 5.065287e-22 1.807463e-18
## 6 limma_voom AP003068.2 1.679486 17.96392 4.522782e-22 1.807463e-18
##   in_drugseq_deg
## 1             no
## 2            yes
## 3            yes
## 4            yes
## 5             no
## 6             no
#if in_drugseq_deg is "no", change method to "DRUGseq"
all_methods_top_table <- all_methods_top_table %>%
  mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method))

#rename methods
all_methods_top_table$method <- recode(all_methods_top_table$method,
                                        "limma_voom" = "macpie:limma_voom",
                                        "DESeq2" = "macpie:DESeq2",
                                        "edgeR" = "macpie:edgeR",
                                        "limma_trend" = "macpie:limma_trend")


ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
## Warning: Removed 831 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

rm(top_table, alldmso_degs, deg_summary)
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'top_table' not
## found
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'alldmso_degs' not
## found

Good DMSO - top 5 from macpie

From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells.

We know these good DMSO wells for batch 24 from their published data:

  • VH02012942: I23, M23

  • VH02012944: D23, H23

  • VH02012956: I23, J23

batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
names(batch24)
## [1] "VH02012956" "VH02012942" "VH02012944"
#make a combined metadata for three plates
batch24_metadata <- batch24 %>% 
  map_dfr(~ {
    .x$Annotation %>%
      mutate(
        Plate_ID        = plate_barcode,
        Well_ID         = well_id,
        Barcode         = paste0(plate_barcode, "_", well_index),
        Row             = LETTERS[row],
        Column          = as.integer(col),
        Species         = "human",
        Cell_type       = "U2OS",
        Model_type      = "2D_adherent",
        Time            = as.factor(hours_post_treatment),
        Unit            = "h",
        Treatment_1     = cmpd_sample_id,
        Concentration_1 = as.numeric(concentration),
        Unit_1          = unit,
        Sample_type     = if_else(well_type == "SA" & col == 24,
                                  "Positive Control",
                                  well_type)
      )
  })


batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
                                                 well_index, col, row, biosample_id, external_biosample_id,
                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
# create a combined UMI matrix for 3 plates
batch24_counts <- batch24 %>%
  map(~ {
    .x$UMI.counts %>%
      as.data.frame() %>% 
      rownames_to_column("gene_id") %>%
      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
      mutate(gene_name = make.unique(gene_name)) %>%
      select(-chrom) %>%
      tibble::column_to_rownames(var = "gene_name") %>%
      as.matrix()
  })

binded_counts <- do.call(cbind, batch24_counts)
as_mac <- CreateSeuratObject(counts = binded_counts, 
  min.cells = 1, 
  min.features = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Data is of class matrix. Coercing to dgCMatrix.
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))

Filtering steps

Include only good DMSO wells as controls

as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)

badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
 filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) |
        (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))|
        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23", "O23","M23","K23"))))



keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))


mac_badDSMOremoved <- as_mac[,keep_wells]

mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
                                           group_by = "combined_id", min_counts =10,
                                           min_samples = min_sample_num)
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) + 
  theme(strip.text = element_text(size=10),
        axis.text.x = element_text(size=10), 
        axis.text.y = element_text(size=8),
        legend.title = element_text(size=10),
        legend.text = element_text(size=8),
        trip.background = element_blank())
girafe(ggobj = p, 
  fonts = list(sans = "sans"),
  options = list(
    opts_hover(css = "stroke:black; stroke-width:1px;")
  ))
## Warning in plot_theme(plot): The `trip.background` theme element is not defined
## in the element hierarchy.
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")

five_dmso_methods_res <- lapply(methods, function(m){
  message("Processing method: ", m) 
  # m<-"limma_voom"
  treatment_samples <- "FF_86_NH56_10"
  control_samples <- "CB_43_EP73_0"
  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))

  batch <- subset$orig.ident
  if (m!="limma_trend"){
      # subset <- filter_genes_by_expression(subset,
      #                        group_by = "combined_id", min_counts = 10,
      #                        min_samples = 1)
     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
  } else{
    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
  }
 
  # plot(plot_volcano(top_table, max.overlaps = 5))
  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  length(intersect(badDMSO_out_degs, drugseq_deg))
  
  badDMSO_out_top_table <- badDMSO_out_top_table %>%
    arrange(p_value_adj, desc(log2FC)) %>%
    mutate(gene = factor(gene, levels = unique(gene)))
  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
  badDMSO_out_top_table <- badDMSO_out_top_table %>%
    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
  
  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
  
  
  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
  
  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
  #   geom_point(alpha = 0.6, size = 1.2) +
  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
  #   theme_classic()
  # 
  # label a few top overlapping genes
  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
  
  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
    geom_point(alpha = 0.6, size = 1.2) +
    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
    theme_classic()
  
  ks <- c(25,50,100,200,500,1000)
  prec_at_k <- sapply(ks, function(k)
    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
  )
  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
  #      ylab = "Precision@k (fraction in DRUGseq set)")
  
  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
    geom_point() + geom_line() +
    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
    theme_classic()
  
  # alluvial plot
  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
  
  #return 
  result_list <- list(top_table = badDMSO_out_top_table,
                      num_degs_macpie = length(badDMSO_out_degs),
                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
                      ecdf_plot = plt_ecdf,
                      ks_test_results = ks_test_results,
                      volcano_plot = volcano_overlap,
                      ks_plot = ks_plot,
                      alluvial_plot = alluvial_plot)
  return(result_list)
  
})
## Processing method: limma_voom
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: edgeR
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(five_dmso_methods_res) <- methods

Summary table

#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(five_dmso_methods_res, function(x) {
  data.frame(
    num_degs_macpie = x$num_degs_macpie,
    n_overlap = x$n_overlap,
    num_degs_DRUGseq = length(drugseq_deg)
  )
}, .id = paste0("macpie_methods"))

deg_summary
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1     limma_voom            3456       592             1423
## 2         DESeq2            1107       549             1423
## 3          edgeR            2185       690             1423
## 4    limma_trend               5         2             1423

ECDF plot

#append top_table from each method into a single data frame
good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method")
head(good_dmso_top_table)
##       method       gene   log2FC   metric      p_value  p_value_adj
## 1 limma_voom AL031963.2 1.477162 34.73174 4.135992e-26 1.048226e-21
## 2 limma_voom    SLC45A4 1.746060 25.83312 2.602120e-22 3.297406e-18
## 3 limma_voom   Z97192.4 5.194480 22.09739 2.424226e-20 2.047986e-16
## 4 limma_voom AP003068.2 1.691204 26.07754 6.263377e-20 3.968475e-16
## 5 limma_voom AL583807.1 2.808626 22.32748 2.810497e-18 1.187154e-14
## 6 limma_voom AC007066.1 1.833326 23.05883 2.624228e-18 1.187154e-14
##   in_drugseq_deg
## 1             no
## 2            yes
## 3            yes
## 4             no
## 5             no
## 6            yes
#if in_drugseq_deg is "no", change method to "DRUGseq"
good_dmso_top_table<- good_dmso_top_table %>%
  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))

#rename methods
good_dmso_top_table$method <- recode(good_dmso_top_table$method,
                                        "limma_voom" = "macpie:limma_voom",
                                        "DESeq2" = "macpie:DESeq2",
                                        "edgeR" = "macpie:edgeR",
                                        "limma_trend" = "macpie:limma_trend")


ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
## Warning: Removed 12331 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

Overlapping volcano plot

five_dmso_methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 575 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

five_dmso_methods_res$DESeq2$volcano_plot
## Warning: Removed 12331 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: ggrepel: 536 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

five_dmso_methods_res$edgeR$volcano_plot
## Warning: ggrepel: 662 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

five_dmso_methods_res$limma_trend$volcano_plot

Alluvial plot

five_dmso_methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

five_dmso_methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

five_dmso_methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

five_dmso_methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Good DMSO - DRUGseq

From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells.

We know these good DMSO wells for batch 24 from their published data:

  • VH02012942: I23, M23

  • VH02012944: D23, H23

  • VH02012956: I23, J23

batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
names(batch24)
## [1] "VH02012956" "VH02012942" "VH02012944"
#make a combined metadata for three plates
batch24_metadata <- batch24 %>% 
  map_dfr(~ {
    .x$Annotation %>%
      mutate(
        Plate_ID        = plate_barcode,
        Well_ID         = well_id,
        Barcode         = paste0(plate_barcode, "_", well_index),
        Row             = LETTERS[row],
        Column          = as.integer(col),
        Species         = "human",
        Cell_type       = "U2OS",
        Model_type      = "2D_adherent",
        Time            = as.factor(hours_post_treatment),
        Unit            = "h",
        Treatment_1     = cmpd_sample_id,
        Concentration_1 = as.numeric(concentration),
        Unit_1          = unit,
        Sample_type     = if_else(well_type == "SA" & col == 24,
                                  "Positive Control",
                                  well_type)
      )
  })


batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
                                                 well_index, col, row, biosample_id, external_biosample_id,
                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
# create a combined UMI matrix for 3 plates
batch24_counts <- batch24 %>%
  map(~ {
    .x$UMI.counts %>%
      as.data.frame() %>% 
      rownames_to_column("gene_id") %>%
      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
      mutate(gene_name = make.unique(gene_name)) %>%
      select(-chrom) %>%
      tibble::column_to_rownames(var = "gene_name") %>%
      as.matrix()
  })

binded_counts <- do.call(cbind, batch24_counts)
as_mac <- CreateSeuratObject(counts = binded_counts, 
  min.cells = 1, 
  min.features = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Data is of class matrix. Coercing to dgCMatrix.
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))

Filtering steps

Include only good DMSO wells as controls

as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)

badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
 filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) |
        (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))|
        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23"))))



keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))


mac_badDSMOremoved <- as_mac[,keep_wells]

mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
                                           group_by = "combined_id", min_counts =10,
                                           min_samples = min_sample_num)
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")

good_dmso_methods_res <- lapply(methods, function(m){
  message("Processing method: ", m) 
  # m<-"limma_voom"
  treatment_samples <- "FF_86_NH56_10"
  control_samples <- "CB_43_EP73_0"
  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))

  batch <- subset$orig.ident
  if (m!="limma_trend"){
      # subset <- filter_genes_by_expression(subset,
      #                        group_by = "combined_id", min_counts = 10,
      #                        min_samples = 1)
     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
  } else{
    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
  }
 
  # plot(plot_volcano(top_table, max.overlaps = 5))
  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  length(intersect(badDMSO_out_degs, drugseq_deg))
  
  badDMSO_out_top_table <- badDMSO_out_top_table %>%
    arrange(p_value_adj, desc(log2FC)) %>%
    mutate(gene = factor(gene, levels = unique(gene)))
  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
  badDMSO_out_top_table <- badDMSO_out_top_table %>%
    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
  
  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
  
  
  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
  
  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
  #   geom_point(alpha = 0.6, size = 1.2) +
  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
  #   theme_classic()
  # 
  # label a few top overlapping genes
  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
  
  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
    geom_point(alpha = 0.6, size = 1.2) +
    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
    theme_classic()
  
  ks <- c(25,50,100,200,500,1000)
  prec_at_k <- sapply(ks, function(k)
    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
  )
  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
  #      ylab = "Precision@k (fraction in DRUGseq set)")
  
  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
    geom_point() + geom_line() +
    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
    theme_classic()
  
  # alluvial plot
  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
  
  #return 
  result_list <- list(top_table = badDMSO_out_top_table,
                      num_degs_macpie = length(badDMSO_out_degs),
                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
                      ecdf_plot = plt_ecdf,
                      ks_test_results = ks_test_results,
                      volcano_plot = volcano_overlap,
                      ks_plot = ks_plot,
                      alluvial_plot = alluvial_plot)
  return(result_list)
  
})
## Processing method: limma_voom
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: DESeq2
## converting counts to integer mode
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: edgeR
## Warning in
## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
## == : p-value will be approximate in the presence of ties
## Processing method: limma_trend
## Warning: Zero sample variances detected, have been offset away from zero
## Warning: p-value will be approximate in the presence of ties
names(good_dmso_methods_res) <- methods

Summary table

#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
deg_summary <- map_df(good_dmso_methods_res, function(x) {
  data.frame(
    num_degs_macpie = x$num_degs_macpie,
    n_overlap = x$n_overlap,
    num_degs_DRUGseq = length(drugseq_deg)
  )
}, .id = paste0("macpie_methods"))

deg_summary
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
## 1     limma_voom            1352       204             1423
## 2         DESeq2              10         8             1423
## 3          edgeR            1757       604             1423
## 4    limma_trend               2         1             1423

ECDF plot

#append top_table from each method into a single data frame
good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method")
head(good_dmso_top_table)
##       method       gene   log2FC   metric       p_value   p_value_adj
## 1 limma_voom AL031963.2 1.450009 52.04866  0.000000e+00  0.000000e+00
## 2 limma_voom  RNU6-633P 1.254947 72.59594  0.000000e+00  0.000000e+00
## 3 limma_voom    ELOCP10 1.269453 35.96512 2.086923e-279 1.762685e-275
## 4 limma_voom AP003068.2 1.677015 35.65408 1.072200e-274 6.792120e-271
## 5 limma_voom AC007066.1 1.821885 32.63913 4.558729e-220 2.310273e-216
## 6 limma_voom    SLC45A4 1.745313 31.67705 6.854635e-218 2.894826e-214
##   in_drugseq_deg
## 1             no
## 2             no
## 3             no
## 4             no
## 5            yes
## 6            yes
#if in_drugseq_deg is "no", change method to "DRUGseq"
good_dmso_top_table<- good_dmso_top_table %>%
  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))

#rename methods
good_dmso_top_table$method <- recode(good_dmso_top_table$method,
                                        "limma_voom" = "macpie:limma_voom",
                                        "DESeq2" = "macpie:DESeq2",
                                        "edgeR" = "macpie:edgeR",
                                        "limma_trend" = "macpie:limma_trend")


ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
    stat_ecdf(size = 1) +
    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
         color = "In DRUGseq DEGs") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    theme_classic()
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
## log-10 transformation introduced infinite values.
## Warning: Removed 19707 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

good_dmso_methods_res$limma_voom$ecdf_plot
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
## log-10 transformation introduced infinite values.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

good_dmso_methods_res$DESeq2$ecdf_plot
## Warning: Removed 19705 rows containing non-finite outside the scale range
## (`stat_ecdf()`).

good_dmso_methods_res$edgeR$ecdf_plot

good_dmso_methods_res$limma_trend$ecdf_plot

### Overlapping volcano plot

good_dmso_methods_res$limma_voom$volcano_plot
## Warning: ggrepel: 196 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

good_dmso_methods_res$DESeq2$volcano_plot
## Warning: Removed 19705 rows containing missing values or values outside the scale range
## (`geom_point()`).

good_dmso_methods_res$edgeR$volcano_plot
## Warning: ggrepel: 583 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

good_dmso_methods_res$limma_trend$volcano_plot

Alluvial plot

good_dmso_methods_res$limma_voom$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

good_dmso_methods_res$DESeq2$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

good_dmso_methods_res$edgeR$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

good_dmso_methods_res$limma_trend$alluvial_plot
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Comparison

To compare DEGs with different replicate numbers and different methods

methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")

get_jaccard <- function(deg_set, drugseq_deg){
  intersection <- length(intersect(deg_set, drugseq_deg))
  union <- length(union(deg_set, drugseq_deg))
  jaccard_index <- intersection / union
  return(jaccard_index)
}

jaccard_index <- lapply(methods, function(m){
  # all dmso
  degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  jaccard_all <- get_jaccard(degs, drugseq_deg)
  # five dmso
  degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  jaccard_five <- get_jaccard(degs, drugseq_deg)
  # three dmso
  degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
  jaccard_three <- get_jaccard(degs, drugseq_deg)
  jaccard_index <- data.frame(
    method = m,
    jaccard_all = jaccard_all,
    jaccard_five = jaccard_five,
    jaccard_three = jaccard_three
  )
  return(jaccard_index)
})

df <- as.data.frame(do.call(rbind, jaccard_index))
rownames(df) <- df$method
df <- df %>% select(-method)
colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO")
pheatmap::pheatmap(df,
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         display_numbers = TRUE,
         main = "Jaccard Index between macpie DEGs and DRUGseq DEGs")

library(UpSetR)
all_dmso <- list(
  limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DRUGseq = drugseq_deg
)
upset(fromList(all_dmso), 
      nsets = 5, 
      order.by = "freq",
      main.bar.color = "black",
      sets.bar.color = "gray23",
      text.scale = c(2, 2, 2, 1.5, 2, 1.5))

five_dmso <- list(
  limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DRUGseq = drugseq_deg
)
upset(fromList(five_dmso), 
      nsets = 5, 
      order.by = "freq",
      main.bar.color = "black",
      sets.bar.color = "gray23",
      text.scale = c(2, 2, 2, 1.5, 2, 1.5))

good_dmso <- list(
  limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
  DRUGseq = drugseq_deg
)
upset(fromList(good_dmso), 
      nsets = 5, 
      order.by = "freq",
      main.bar.color = "black",
      sets.bar.color = "gray23",
      text.scale = c(2, 2, 2, 1.5, 2, 1.5))